registerDoParallel(cores = detectCores()-2)
This data is scraped every few months, it contains most all relevant information that Craigslist provides on car sales including columns like price, condition, manufacturer, latitude/longitude, and 18 other categories.
Its sample size is large enough to reflect the real market condition. It also contains various variables covering different status. Those variables can be converted into dummy variables or into levels. In our model, title status and transmission are converted into 1 and 0 levels while fuel types, vehicle types and drive train are converted into dummies.
At the end we can use text mining for description column to see what Americans truly want.
Data source: https://www.kaggle.com/datasets/austinreese/craigslist-carstrucks-data
Size 1.25GB, contains approximately 420,000 rows
Dataset along with submission will be the reduced size file.
#import original data
#craglistall = read.csv("Craglist.csv")
#craglistall[craglistall == ""] <- NA
#craglistall <- drop_na(craglistall)
#nrow(craglistall)
#export complete case data to csv, this will be the reduced size file for submission
#write.csv(craglistall,"craglistall.csv", row.names = FALSE)
#import complete case data
craglistcomplete = read.csv("craglistall.csv")
#recode cylinder and as.numeric for columns
craglistcomplete$cylinders[craglistcomplete$cylinders == "other"] <- "0"
craglistcomplete$cylinders <- as.numeric(gsub(" cylinders" ,"",craglistcomplete$cylinders))
craglistcomplete <- craglistcomplete%>%mutate_at(c('price', 'year', 'odometer'), as.numeric)
craglistcomplete$price[craglistcomplete$price <= 1000] <- NA
craglistcomplete$price[craglistcomplete$price >= 100000] <- NA
craglistcomplete$price[craglistcomplete$odometer >= 500000] <- NA
craglistcomplete$transmission[craglistcomplete$transmission == "other"] <- NA
craglistcomplete <- drop_na(craglistcomplete)
names(craglistcomplete)
## [1] "region" "price" "year" "manufacturer" "model"
## [6] "condition" "cylinders" "fuel" "odometer" "title_status"
## [11] "transmission" "drive" "size" "type" "paint_color"
## [16] "description" "state" "lat" "long"
#get condition levels, recode to condition score as follow, better contidion better score, 6 levels
craglistcomplete$condition <- as.integer(ifelse(craglistcomplete$condition == "new",6,ifelse(craglistcomplete$condition == "like new",5,ifelse(craglistcomplete$condition == "excellent",4,ifelse(craglistcomplete$condition == "good",3, ifelse(craglistcomplete$condition == "fair",2,ifelse(craglistcomplete$condition == "salvage",1,0)))))))
#get size levels, larger size higher socore, 4 levels
craglistcomplete$size <- as.integer(ifelse(craglistcomplete$size == "full-size",4,ifelse(craglistcomplete$size == "mid-size",3, ifelse(craglistcomplete$size == "compact",2,ifelse(craglistcomplete$size == "sub-compact",1,0)))))
#recode title_status in to clean == 1 , others == 0 (not clean)
craglistcomplete$title_status <- as.integer(ifelse(craglistcomplete$title_status == "clean", 1, 0))
#recode transmission in to auto == 1 , manual == 0
craglistcomplete$transmission <- as.integer(ifelse(craglistcomplete$transmission == "automatic", 1, 0))
#get dummies for "fuel", "type", "drive"
craglist.grp <- dummy_cols(craglistcomplete, select_columns=c("fuel", "type", "drive"), remove_selected_columns = TRUE)
#head(craglist.grp)
#there's column name that will cause trouble in random forest model
colnames(craglist.grp)[26] <- "type_minivan"
plot21<-ggplot(craglistcomplete)+geom_boxplot(mapping =aes(x=type, y=price), fill = "#FFDB6D", color = "#C4961A")
plot21
Fuel_Price <- craglistcomplete %>% group_by(fuel) %>% summarise(avgprice = mean(price))
ggplot(data = Fuel_Price) + geom_bar(mapping = aes(x = fuel, y = avgprice), stat = "identity", fill = "#F59811") + theme_light() + labs(x = "fuel type") + ggtitle("Avg price by fuel type") + theme(plot.title = element_text(hjust = 0.5))
#group by state and price
group261 <- subset(craglistcomplete, select = c(state, price))
#filtering for average
state_plot <- group261 %>% group_by(state) %>% summarise(avgprice = mean(price))
state_plot
## # A tibble: 51 × 2
## state avgprice
## <chr> <dbl>
## 1 ak 24296.
## 2 al 16438.
## 3 ar 15468.
## 4 az 14111.
## 5 ca 13998.
## 6 co 12261.
## 7 ct 11047.
## 8 dc 10300.
## 9 de 14253.
## 10 fl 14645.
## # … with 41 more rows
plot_usmap(data = state_plot , values = "avgprice", color = "coral") +
scale_fill_continuous(
low = "white", high = "red", name = "Average Price", label = scales::comma
) + theme(legend.position = "right")
#Google Map API access key
register_google(key = "AIzaSyDuBhfsCMjhNKhxV2J1lkQjF3vTRO372tQ")
#Get pittsburgh map
Pitt <- get_map(location = "Pittsburgh", zoom = 10, source="google", maptype = "roadmap")
#Get columbus map
Cbus <- get_map(location = "Columbus", zoom = 10, source="google", maptype = "roadmap")
PittMap <- ggmap(Pitt) + geom_point(aes(long, lat, color = drive), data = craglistcomplete) + ggtitle("Pittsburgh Drive Train") + xlab("Longitude") + ylab("Latitude") + theme(plot.title = element_text(hjust = 0.5))
CbusMap <- ggmap(Cbus) + geom_point(aes(long, lat, color = drive), data = craglistcomplete) + ggtitle("Columbus Drive Train") + xlab("Longitude") + ylab("Latitude") + theme(plot.title = element_text(hjust = 0.5))
PittMap; CbusMap
Fuel_Price <- craglistcomplete %>% group_by(type,year) %>% summarise_if(is.numeric, mean, na.rm=TRUE) %>% filter(year<=2020 & year>=2000)
plot231 <- ggplot(data=Fuel_Price, aes(x=year, y = price, group=type, fill=type)) + geom_area(alpha=0.4 , size=0.5, colour="black")
plot231
sdf <- craglistcomplete %>% group_by(type,year,region) %>% summarise(count_n = n(),avgprice = mean(price), avgodometer = mean(odometer)) %>% filter(year>=1950)
sdf
## # A tibble: 29,136 × 6
## # Groups: type, year [751]
## type year region count_n avgprice avgodometer
## <chr> <dbl> <chr> <int> <dbl> <dbl>
## 1 bus 1968 southeast missouri 1 5500 12345
## 2 bus 1970 sarasota-bradenton 1 32500 86653
## 3 bus 1975 dallas / fort worth 1 29999 100000
## 4 bus 1976 redding 1 5500 55207
## 5 bus 1978 kalispell 1 9500 0
## 6 bus 1978 kennewick-pasco-richland 1 25000 78000
## 7 bus 1978 oregon coast 1 25000 78000
## 8 bus 1979 phoenix 1 23800 1234
## 9 bus 1982 houston 1 11000 82000
## 10 bus 1984 fort collins / north CO 1 7950 136543
## # … with 29,126 more rows
plot232 <- ggplot(sdf %>% filter(year>=1950), aes(x = avgodometer, y = avgprice, color = type)) + geom_point(aes(size = count_n,frame = year)) + scale_x_log10() + ggtitle("Listed car price ~ Year and odometer") + theme(plot.title = element_text(hjust = 0.5))
# Using frame = year, we specify the datapoints for each frame
ggplotly(plot232)
#group by mfg and price
group233 <- subset(craglistcomplete, select = c(manufacturer, price))
#filtering for average
mfg_plot <- group233 %>% group_by(manufacturer) %>% summarise(avgprice = mean(price))
mfg_plot
## # A tibble: 41 × 2
## manufacturer avgprice
## <chr> <dbl>
## 1 acura 9901.
## 2 alfa-romeo 21796.
## 3 aston-martin 53367
## 4 audi 14416.
## 5 bmw 13478.
## 6 buick 9467.
## 7 cadillac 12655.
## 8 chevrolet 15866.
## 9 chrysler 8129.
## 10 datsun 13435.
## # … with 31 more rows
plot233 <- ggplot(data = mfg_plot) + geom_bar(aes(x = manufacturer, y = avgprice), show.legend = FALSE, stat = "identity", fill = "#79A5FF") + theme(axis.text.x = element_text(angle = 90)) + ggtitle("Manufacturer's car Avg Price") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()
plot233
#group by
group234 <- subset(craglistcomplete, select = c(drive, price, region, transmission))
#filtering for average
drive_plot <- group234 %>% group_by(drive, region, transmission)%>% summarise(avgprice = mean(price)) %>% filter(region == "columbus" | region == "pittsburgh" | region == "cleveland" | region == "philadelphia" | region == "cincinnati" | region == "harrisburg")
plot234 <- ggplot(data = drive_plot) + geom_bar(mapping = aes(x = region, y = avgprice, fill = transmission), stat = "identity") + facet_wrap(~drive) + theme_light() + labs(x = "Drive train across cities, 1 is automatic transmission") + ggtitle("Average price by drive train") + theme(plot.title = element_text(hjust = 0.5)) + theme(axis.text.x = element_text(angle = 90))
plot234
#Remove variables not needed for modeling
craglist.sub <- subset(craglist.grp, select = -c(region, manufacturer, model, paint_color, description, state))
set.seed(114514, sample.kind = "Rejection")
split = sample(nrow(craglist.sub),0.8*nrow(craglist.sub))
train = craglist.sub[split,]
test = craglist.sub[-split,]
M <- cor(train)
corrplot(M, method = "color", tl.cex = 0.7)
#Linear model
lm <- lm(price ~ .- fuel_diesel - drive_fwd - type_sedan , data = train)
summary(lm)
##
## Call:
## lm(formula = price ~ . - fuel_diesel - drive_fwd - type_sedan,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34651 -4227 -745 3137 97538
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.473e+05 7.588e+03 -72.119 < 2e-16 ***
## year 2.777e+02 3.772e+00 73.621 < 2e-16 ***
## condition 1.895e+03 4.563e+01 41.537 < 2e-16 ***
## cylinders 1.276e+03 2.755e+01 46.329 < 2e-16 ***
## odometer -8.687e-02 5.664e-04 -153.381 < 2e-16 ***
## title_status 2.036e+03 1.325e+02 15.364 < 2e-16 ***
## transmission -1.740e+03 1.298e+02 -13.406 < 2e-16 ***
## size 7.027e+02 5.198e+01 13.518 < 2e-16 ***
## lat -2.249e+01 5.814e+00 -3.868 0.00011 ***
## long -4.021e+01 1.799e+00 -22.355 < 2e-16 ***
## fuel_electric -1.202e+03 8.471e+02 -1.419 0.15592
## fuel_gas -1.192e+04 1.367e+02 -87.215 < 2e-16 ***
## fuel_hybrid -9.647e+03 3.217e+02 -29.993 < 2e-16 ***
## fuel_other -1.263e+04 8.551e+02 -14.773 < 2e-16 ***
## type_bus 3.769e+03 8.215e+02 4.588 4.49e-06 ***
## type_convertible 4.264e+03 2.098e+02 20.318 < 2e-16 ***
## type_coupe 4.708e+03 1.608e+02 29.279 < 2e-16 ***
## type_hatchback 8.058e+01 1.751e+02 0.460 0.64533
## type_minivan 1.694e+03 2.162e+02 7.837 4.69e-15 ***
## type_offroad 4.810e+03 4.867e+02 9.883 < 2e-16 ***
## type_other 1.999e+03 4.126e+02 4.846 1.26e-06 ***
## type_pickup 4.146e+03 1.457e+02 28.444 < 2e-16 ***
## type_SUV 1.313e+03 1.021e+02 12.855 < 2e-16 ***
## type_truck 6.560e+03 1.278e+02 51.332 < 2e-16 ***
## type_van 4.064e+03 1.877e+02 21.649 < 2e-16 ***
## type_wagon 2.807e+02 2.384e+02 1.177 0.23906
## drive_4wd 4.471e+03 9.984e+01 44.778 < 2e-16 ***
## drive_rwd 1.901e+03 1.078e+02 17.633 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7622 on 58034 degrees of freedom
## Multiple R-squared: 0.5974, Adjusted R-squared: 0.5972
## F-statistic: 3189 on 27 and 58034 DF, p-value: < 2.2e-16
test$predlm = predict(lm, newdata = test)
train.mean = mean(train$price)
SSElm = sum((test$predlm - test$price)^2)
SSTlm = sum((train.mean - test$price)^2)
MAElm = mean(abs(test$price - test$predlm))
OSRlm = 1 - SSElm/SSTlm
OSRlm
## [1] 0.6002058
MAElm
## [1] 5270.548
set.seed(114514, sample.kind = "Rejection")
basetree = rpart(price ~ ., data=train, method="anova",minbucket=50,cp=0.05)
plotcp(basetree)
rpart.plot(basetree, digits=-5, fallen.leaves = T)
#Adjust the cp and minibucket
adjtree = rpart(price ~ ., data=train, method="anova",minbucket=5,cp=0.001)
rpart.plot(basetree, digits=-5, fallen.leaves = T)
test$predrt = predict(adjtree, newdata = test)
SSErt = sum((test$predrt - test$price)^2)
SSTrt = sum((train.mean - test$price)^2)
MAErt = mean(abs(test$price - test$predrt))
OSRrt = 1 - SSErt/SSTrt
OSRrt
## [1] 0.734258
MAErt
## [1] 4123.442
set.seed(114514, sample.kind = "Rejection")
baserf = randomForest(price~., data=train, ntree=300, nodesize=20, mtry=4)
x = train[,-1]
y = train$price
set.seed(114514, sample.kind="Rejection")
tuneRF(x, y, mtryStart = 4, stepFactor = 2, ntreeTry=300, nodesize=20, improve=0.01)
## mtry = 4 OOB error = 28614929
## Searching left ...
## mtry = 2 OOB error = 49289891
## -0.7225236 0.01
## Searching right ...
## mtry = 8 OOB error = 21468184
## 0.2497558 0.01
## mtry = 16 OOB error = 20520460
## 0.04414552 0.01
## mtry = 30 OOB error = 21022298
## -0.02445548 0.01
## mtry OOBError
## 2 2 49289891
## 4 4 28614929
## 8 8 21468184
## 16 16 20520460
## 30 30 21022298
#Tuned rf
rf.final = randomForest(price~., data = train, ntree=300, nodesize=20, mtry=16)
varImpPlot(rf.final)
test$predrf = predict(rf.final, newdata = test)
SSErf = sum((test$predrf - test$price)^2)
SSTrf = sum((train.mean - test$price)^2)
MAErf = mean(abs(test$price - test$predrf))
OSRrf = 1 - SSErf/SSTrf
OSRrf
## [1] 0.8630558
MAErf
## [1] 2576.654
par(mfrow=c(2,2))
partialPlot(rf.final, test , condition)
partialPlot(rf.final, test , cylinders)
partialPlot(rf.final, test , year)
partialPlot(rf.final, test , odometer)
set.seed(114514, sample.kind = "Rejection")
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
# Repeat 5 k-fold cross-validation
xgb_ctrl <- train(price ~ ., data = train, method = "xgbTree",verbosity = 0, trControl = ctrl)
#visualizing XGB
xgb.plot.tree(model = xgb_ctrl$finalModel, trees = 1)
test$predxgb = predict(xgb_ctrl, newdata = test)
SSExgb = sum((test$predxgb - test$price)^2)
SSTxgb = sum((train.mean - test$price)^2)
MAExgb = mean(abs(test$price - test$predxgb))
OSRxgb = 1 - SSExgb/SSTxgb
OSRxgb
## [1] 0.8305857
MAExgb
## [1] 3169.467
table <- matrix(c(MAElm, OSRlm, MAErt, OSRrt, MAErf, OSRrf, MAExgb, OSRxgb), ncol = 2, byrow = T)
colnames(table) <- c('MAE', 'R-Square')
rownames(table) <- c('Linear Model','Regression Tree','Random Forest','XGBoost')
round(table, digits = 2)
## MAE R-Square
## Linear Model 5270.55 0.60
## Regression Tree 4123.44 0.73
## Random Forest 2576.65 0.86
## XGBoost 3169.47 0.83
craglist.text1 <- craglistcomplete$description
craglist.text1 <- gsub('[\t\n]','',craglist.text1)
craglist.text1 <- gsub('[^[:alnum:] ]','',craglist.text1)
craglist.text1 <- gsub('[[:digit:]]', '', craglist.text1)
Car_description <- data.frame(line = 1:nrow(craglistcomplete),as.character(craglist.text1))
Car_description$text <- Car_description$as.character.craglist.text1.
Car_description$as.character.craglist.text1. <- NULL
Car_description <- Car_description %>% unnest_tokens(word, text)
Car_description <- Car_description %>% anti_join(stop_words, by = c(word = "word"))
word_counts <- Car_description %>%
count(word, sort = TRUE)
#word_counts
wordcloud2(word_counts, color = "random-light", backgroundColor = "white")
Car_description %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>% # acast(): Cast A Molten Data Frame Into An Array Or Data Frame.
comparison.cloud(colors = c("#F8766D", "#00BFC4"),
max.words = 150)